1 Aggregated and atomic scores per method

2 read





list_wd <- strsplit(getwd(), '/')[[1]]
if (list_wd[length(list_wd)] == 'hadaca3_framework') {
  score_files <- list.files(path = "./output/scores/", full.names = TRUE)
} else {
  # score_files <- list.files(pattern = 'score-li*', full.names = TRUE)
  # score_files <- system("find . -maxdepth 1 -type f -name 'score-li*'", intern = TRUE)
  score_files <- dir_ls(".", regexp = "score-li.*")
}

plan(multisession,workers=25)
# plan(sequential)


process_file <- function(score_file) {
  base_name <- basename(score_file)
  components <- str_match(base_name, 
    "score-li-(.+)_(.+)_mixRNA_(.+)_(.+)_RNA_(.+)_(.+)_scRNA_(.+)_(.+)_(.+)_mixMET_(.+)_(.+)_MET_(.+)_(.+)_(.+)_(.+).h5")[2:16]

  # If file name doesn't match expected pattern, skip
  if (any(is.na(components))) return(NULL)

  scores <- tryCatch({
    s <- read_hdf5(score_file)
    gc()
    s
  }, error = function(e) {
    message("Error reading file: ", score_file)
    message(e)
    NULL
  })

  # scores <- tryCatch({
  #   read_hdf5(score_file)
  # }, error = function(e) return(NULL))

  if (is.null(scores)) return(NULL)

  cbind(
    data.frame(
      dataset = components[1],
      ref = components[2],
      preprocessing_mixRNA = components[3],
      feature_selection_mixRNA = components[4],
      preprocessing_RNA = components[5],
      feature_selection_RNA = components[6],
      preprocessing_scRNA = components[7],
      feature_selection_scRNA = components[8],
      deconvolution_rna = components[9],
      preprocessing_mixMET = components[10],
      feature_selection_mixMET = components[11],
      preprocessing_MET = components[12],
      feature_selection_MET = components[13],
      deconvolution_met = components[14],
      late_integration = components[15],
      stringsAsFactors = FALSE
    ),
    scores
  )
}

# Process files in parallel
# results_list <- lapply(score_files, process_file)

results_list <- future_map(score_files, function(f) {
  tryCatch(process_file(f), error = function(e) NULL)
})



# bind rows
results_li <- do.call(rbind, results_list)


results_li %>%
  # filter(dc==2) %>%
  group_by(late_integration) %>%
  summarise(GlobalScore = median(score_aggreg)) %>%
  arrange(desc(GlobalScore))
#> # A tibble: 5 × 2
#>   late_integration GlobalScore
#>   <chr>                  <dbl>
#> 1 OnlyMet                0.671
#> 2 limeanRMSE             0.671
#> 3 limean                 0.657
#> 4 liCtSens               0.657
#> 5 OnlyRna                0.542


results_li_arrange = results_li %>%
  group_by(preprocessing_mixRNA, feature_selection_mixRNA, 
           preprocessing_RNA, feature_selection_RNA, 
           preprocessing_scRNA, feature_selection_scRNA, deconvolution_rna, 
           preprocessing_mixMET,feature_selection_mixMET, 
           preprocessing_MET, feature_selection_MET, deconvolution_met, 
           late_integration, .groups = "keep") %>% 
  summarise(GlobalScore = median(score_aggreg)) %>%
  arrange(desc(GlobalScore)) 
#> `summarise()` has grouped output by 'preprocessing_mixRNA',
#> 'feature_selection_mixRNA', 'preprocessing_RNA', 'feature_selection_RNA',
#> 'preprocessing_scRNA', 'feature_selection_scRNA', 'deconvolution_rna',
#> 'preprocessing_mixMET', 'feature_selection_mixMET', 'preprocessing_MET',
#> 'feature_selection_MET', 'deconvolution_met', 'late_integration'. You can
#> override using the `.groups` argument.



# Optional: reorder factors
all_data_used <- c('dataset', 'ref')
for (data_used in all_data_used) {
  results_li[[data_used]] <- factor(results_li[[data_used]], levels = unique(results_li[[data_used]]))
}

# Optional: order other factors based on performance on 'invitro1'
if ("invitro1" %in% results_li$dataset) {
  all_functions_li <- c(
    'preprocessing_mixRNA', 'feature_selection_mixRNA',
    'preprocessing_RNA', 'feature_selection_RNA',
    'preprocessing_scRNA', 'feature_selection_scRNA', 'deconvolution_rna',
    'preprocessing_mixMET', 'feature_selection_mixMET',
    'preprocessing_MET', 'feature_selection_MET', 'deconvolution_met',
    'late_integration'
  )
  for (fun in all_functions_li) {
    results_li[[fun]] <- factor(results_li[[fun]],
      levels = unique(results_li[[fun]][order(results_li$score_aggreg[results_li$dataset == 'invitro1'], decreasing = TRUE)]))
  }
}

# Write compressed output
write.csv(results_li, file = gzfile("results_li.csv.gz"), row.names = FALSE)

index_aggreg <- which(names(results_li) == "score_aggreg")
#> Warning in instance$preRenderHook(instance): It seems your data is too big for
#> client-side DataTables. You may consider server-side processing:
#> https://rstudio.github.io/DT/server.html

3 Early integration_table

4 Nico test visu (GPT…)

multi_level_cols <- sapply(results_li[all_functions_li], function(x) nlevels(x) > 1)
filtered_functions <- all_functions_li[multi_level_cols]
dropped_cols <- all_functions_li[!multi_level_cols]
cat("Dropped columns (only 1 level):", paste(dropped_cols, collapse = ", "), "\n")
#> Dropped columns (only 1 level):


# Now run the model
model <- lm(score_aggreg ~ ., data = results_li[, c(filtered_functions, "score_aggreg")])
summary(model)
#> 
#> Call:
#> lm(formula = score_aggreg ~ ., data = results_li[, c(filtered_functions, 
#>     "score_aggreg")])
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.53087 -0.04254  0.00356  0.04958  0.42686 
#> 
#> Coefficients: (22 not defined because of singularities)
#>                                              Estimate Std. Error t value
#> (Intercept)                                 6.546e-01  1.151e-03 568.673
#> preprocessing_mixRNAppID                   -2.539e-02  5.743e-04 -44.201
#> preprocessing_mixRNAScale                  -6.536e-03  5.743e-04 -11.381
#> preprocessing_mixRNACPM                    -6.343e-03  5.743e-04 -11.044
#> preprocessing_mixRNAnopp                    2.760e-02  6.558e-03   4.209
#> feature_selection_mixRNAfsID                1.258e-03  9.948e-04   1.265
#> feature_selection_mixRNAscSPLSDA           -3.294e-02  7.034e-04 -46.832
#> feature_selection_mixRNAnofs                       NA         NA      NA
#> preprocessing_RNAppID                              NA         NA      NA
#> preprocessing_RNAScale                             NA         NA      NA
#> preprocessing_RNACPM                               NA         NA      NA
#> preprocessing_RNAnopp                              NA         NA      NA
#> feature_selection_RNAfsID                          NA         NA      NA
#> feature_selection_RNAscSPLSDA                      NA         NA      NA
#> feature_selection_RNAnofs                          NA         NA      NA
#> preprocessing_scRNAscCCAintegration         3.253e-03  9.948e-04   3.270
#> preprocessing_scRNAsccluster                3.253e-03  9.305e-04   3.496
#> preprocessing_scRNAsc_binarypseudobulk_log  1.853e-02  9.948e-04  18.624
#> preprocessing_scRNACPM                     -6.179e-18  9.948e-04   0.000
#> preprocessing_scRNALogNorm                 -4.316e-18  9.948e-04   0.000
#> preprocessing_scRNAScale                   -4.926e-18  9.948e-04   0.000
#> preprocessing_scRNAppID                    -4.440e-18  9.948e-04   0.000
#> preprocessing_scRNAscScale                         NA         NA      NA
#> preprocessing_scRNAnopp                            NA         NA      NA
#> feature_selection_scRNAscSPLSDA             3.097e-18  7.034e-04   0.000
#> feature_selection_scRNAfsID                        NA         NA      NA
#> feature_selection_scRNAnofs                        NA         NA      NA
#> deconvolution_rnaRLR                        4.862e-02  7.598e-04  63.990
#> deconvolution_rnaRLRnnls                   -4.363e-03  7.598e-04  -5.742
#> deconvolution_rnaepic                       6.462e-03  7.598e-04   8.505
#> deconvolution_rnaRLRpoisson                 6.038e-02  7.598e-04  79.467
#> deconvolution_rnannls                      -9.064e-03  7.598e-04 -11.930
#> deconvolution_rnannlslargeref              -6.391e-02  7.598e-04 -84.120
#> deconvolution_rnanode                              NA         NA      NA
#> preprocessing_mixMETScale                  -9.697e-03  5.775e-04 -16.792
#> preprocessing_mixMETnormalize              -9.697e-03  5.775e-04 -16.792
#> preprocessing_mixMETLogNorm                -1.294e-02  5.775e-04 -22.414
#> preprocessing_mixMETnopp                   -1.226e-01  2.001e-03 -61.264
#> feature_selection_mixMETnofs                       NA         NA      NA
#> preprocessing_METScale                             NA         NA      NA
#> preprocessing_METnormalize                         NA         NA      NA
#> preprocessing_METLogNorm                           NA         NA      NA
#> preprocessing_METnopp                              NA         NA      NA
#> feature_selection_METnofs                          NA         NA      NA
#> deconvolution_metRLRnnls                   -4.510e-04  7.639e-04  -0.590
#> deconvolution_metnnls                      -4.510e-04  7.639e-04  -0.590
#> deconvolution_metnnlslargeref              -4.510e-04  7.639e-04  -0.590
#> deconvolution_metRLR                        2.229e-02  7.639e-04  29.181
#> deconvolution_metRLRpoisson                 1.860e-02  7.639e-04  24.352
#> deconvolution_metepic                      -2.162e-02  7.639e-04 -28.305
#> deconvolution_metnode                              NA         NA      NA
#> late_integrationlimean                     -6.239e-09  5.003e-04   0.000
#> late_integrationlimeanRMSE                  2.229e-02  5.003e-04  44.551
#> late_integrationOnlyRna                            NA         NA      NA
#> late_integrationOnlyMet                            NA         NA      NA
#>                                            Pr(>|t|)    
#> (Intercept)                                 < 2e-16 ***
#> preprocessing_mixRNAppID                    < 2e-16 ***
#> preprocessing_mixRNAScale                   < 2e-16 ***
#> preprocessing_mixRNACPM                     < 2e-16 ***
#> preprocessing_mixRNAnopp                   2.57e-05 ***
#> feature_selection_mixRNAfsID               0.205954    
#> feature_selection_mixRNAscSPLSDA            < 2e-16 ***
#> feature_selection_mixRNAnofs                     NA    
#> preprocessing_RNAppID                            NA    
#> preprocessing_RNAScale                           NA    
#> preprocessing_RNACPM                             NA    
#> preprocessing_RNAnopp                            NA    
#> feature_selection_RNAfsID                        NA    
#> feature_selection_RNAscSPLSDA                    NA    
#> feature_selection_RNAnofs                        NA    
#> preprocessing_scRNAscCCAintegration        0.001074 ** 
#> preprocessing_scRNAsccluster               0.000472 ***
#> preprocessing_scRNAsc_binarypseudobulk_log  < 2e-16 ***
#> preprocessing_scRNACPM                     1.000000    
#> preprocessing_scRNALogNorm                 1.000000    
#> preprocessing_scRNAScale                   1.000000    
#> preprocessing_scRNAppID                    1.000000    
#> preprocessing_scRNAscScale                       NA    
#> preprocessing_scRNAnopp                          NA    
#> feature_selection_scRNAscSPLSDA            1.000000    
#> feature_selection_scRNAfsID                      NA    
#> feature_selection_scRNAnofs                      NA    
#> deconvolution_rnaRLR                        < 2e-16 ***
#> deconvolution_rnaRLRnnls                   9.37e-09 ***
#> deconvolution_rnaepic                       < 2e-16 ***
#> deconvolution_rnaRLRpoisson                 < 2e-16 ***
#> deconvolution_rnannls                       < 2e-16 ***
#> deconvolution_rnannlslargeref               < 2e-16 ***
#> deconvolution_rnanode                            NA    
#> preprocessing_mixMETScale                   < 2e-16 ***
#> preprocessing_mixMETnormalize               < 2e-16 ***
#> preprocessing_mixMETLogNorm                 < 2e-16 ***
#> preprocessing_mixMETnopp                    < 2e-16 ***
#> feature_selection_mixMETnofs                     NA    
#> preprocessing_METScale                           NA    
#> preprocessing_METnormalize                       NA    
#> preprocessing_METLogNorm                         NA    
#> preprocessing_METnopp                            NA    
#> feature_selection_METnofs                        NA    
#> deconvolution_metRLRnnls                   0.554915    
#> deconvolution_metnnls                      0.554915    
#> deconvolution_metnnlslargeref              0.554915    
#> deconvolution_metRLR                        < 2e-16 ***
#> deconvolution_metRLRpoisson                 < 2e-16 ***
#> deconvolution_metepic                       < 2e-16 ***
#> deconvolution_metnode                            NA    
#> late_integrationlimean                     0.999990    
#> late_integrationlimeanRMSE                  < 2e-16 ***
#> late_integrationOnlyRna                          NA    
#> late_integrationOnlyMet                          NA    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.07673 on 142907 degrees of freedom
#> Multiple R-squared:  0.2709, Adjusted R-squared:  0.2708 
#> F-statistic:  1660 on 32 and 142907 DF,  p-value: < 2.2e-16
anova_results <- anova(model)
anova_results <- anova_results[order(-anova_results$`Sum Sq`), ]
print(anova_results)
#> Analysis of Variance Table
#> 
#> Response: score_aggreg
#>                              Df Sum Sq Mean Sq  F value Pr(>F)    
#> Residuals                142907 841.44   0.006                    
#> deconvolution_rna             6 204.60  34.099 5791.293 <2e-16 ***
#> preprocessing_mixMET          4  29.06   7.266 1234.049 <2e-16 ***
#> deconvolution_met             6  25.53   4.255  722.707 <2e-16 ***
#> feature_selection_mixRNA      2  22.37  11.183 1899.310 <2e-16 ***
#> late_integration              2  15.58   7.791 1323.201 <2e-16 ***
#> preprocessing_mixRNA          4  13.01   3.252  552.332 <2e-16 ***
#> preprocessing_scRNA           7   2.56   0.365   62.066 <2e-16 ***
#> feature_selection_scRNA       1   0.00   0.000    0.000      1    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Get coefficients sorted by value
coefs <- coef(model)
coefs <- coefs[!grepl("(Intercept)", names(coefs))]
coefs_sorted <- sort(coefs, decreasing = TRUE)

# Top positive and negative functions
head(coefs_sorted, 10)  # most positive
#>                deconvolution_rnaRLRpoisson 
#>                                0.060376812 
#>                       deconvolution_rnaRLR 
#>                                0.048618243 
#>                   preprocessing_mixRNAnopp 
#>                                0.027601578 
#>                       deconvolution_metRLR 
#>                                0.022291250 
#>                 late_integrationlimeanRMSE 
#>                                0.022290777 
#>                deconvolution_metRLRpoisson 
#>                                0.018602474 
#> preprocessing_scRNAsc_binarypseudobulk_log 
#>                                0.018526968 
#>                      deconvolution_rnaepic 
#>                                0.006461853 
#>               preprocessing_scRNAsccluster 
#>                                0.003253401 
#>        preprocessing_scRNAscCCAintegration 
#>                                0.003253401
tail(coefs_sorted, 10)  # most negative
#>        preprocessing_mixRNAScale            deconvolution_rnannls 
#>                     -0.006536359                     -0.009064465 
#>        preprocessing_mixMETScale    preprocessing_mixMETnormalize 
#>                     -0.009696672                     -0.009696672 
#>      preprocessing_mixMETLogNorm            deconvolution_metepic 
#>                     -0.012943282                     -0.021622150 
#>         preprocessing_mixRNAppID feature_selection_mixRNAscSPLSDA 
#>                     -0.025386243                     -0.032942080 
#>    deconvolution_rnannlslargeref         preprocessing_mixMETnopp 
#>                     -0.063912036                     -0.122606269
# Load required package
library(fastDummies)

# Use only multi-level function columns (from before)
df_pca <- results_li[, c(filtered_functions, "score_aggreg")]

# One-hot encode the factor columns
df_pca_encoded <- dummy_cols(df_pca, select_columns = filtered_functions, remove_first_dummy = TRUE, remove_selected_columns = TRUE)
# Run PCA on all features (excluding score_aggreg)
pca_result <- prcomp(df_pca_encoded[, !colnames(df_pca_encoded) %in% "score_aggreg"], center = TRUE, scale. = TRUE)

# View explained variance
summary(pca_result)
#> Importance of components:
#>                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
#> Standard deviation     2.8301 2.4599 2.23142 1.63273 1.63273 1.62980 1.62980
#> Proportion of Variance 0.1483 0.1120 0.09221 0.04937 0.04937 0.04919 0.04919
#> Cumulative Proportion  0.1483 0.2604 0.35258 0.40195 0.45132 0.50051 0.54970
#>                            PC8    PC9   PC10   PC11   PC12   PC13   PC14
#> Standard deviation     1.44262 1.2209 1.0800 1.0800 1.0800 1.0800 1.0800
#> Proportion of Variance 0.03854 0.0276 0.0216 0.0216 0.0216 0.0216 0.0216
#> Cumulative Proportion  0.58824 0.6158 0.6374 0.6590 0.6806 0.7022 0.7238
#>                           PC15    PC16    PC17    PC18    PC19   PC20   PC21
#> Standard deviation     1.07907 1.07907 1.07907 1.07907 1.07907 1.0444 1.0444
#> Proportion of Variance 0.02156 0.02156 0.02156 0.02156 0.02156 0.0202 0.0202
#> Cumulative Proportion  0.74541 0.76697 0.78853 0.81010 0.83166 0.8519 0.8721
#>                          PC22   PC23   PC24    PC25    PC26    PC27    PC28
#> Standard deviation     1.0444 1.0444 1.0444 0.92890 0.81625 0.81314 0.70407
#> Proportion of Variance 0.0202 0.0202 0.0202 0.01598 0.01234 0.01224 0.00918
#> Cumulative Proportion  0.8923 0.9125 0.9327 0.94864 0.96098 0.97322 0.98240
#>                           PC29    PC30    PC31    PC32      PC33      PC34
#> Standard deviation     0.68807 0.40819 0.40744 0.37982 4.854e-15 4.341e-15
#> Proportion of Variance 0.00877 0.00309 0.00307 0.00267 0.000e+00 0.000e+00
#> Cumulative Proportion  0.99117 0.99425 0.99733 1.00000 1.000e+00 1.000e+00
#>                             PC35      PC36      PC37      PC38      PC39
#> Standard deviation     3.948e-15 3.035e-15 9.559e-16 8.283e-16 3.638e-16
#> Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
#> Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
#>                             PC40      PC41      PC42      PC43      PC44
#> Standard deviation     2.533e-16 2.457e-16 2.457e-16 2.457e-16 2.457e-16
#> Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
#> Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
#>                             PC45      PC46      PC47      PC48      PC49
#> Standard deviation     2.457e-16 2.457e-16 2.457e-16 2.457e-16 2.457e-16
#> Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
#> Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
#>                             PC50      PC51      PC52      PC53      PC54
#> Standard deviation     2.457e-16 2.457e-16 2.457e-16 2.457e-16 8.894e-17
#> Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
#> Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00

# Scree plot
plot(pca_result, type = "l", main = "Scree Plot")

# Use ggplot2 for PCA biplot with color = score_aggreg
library(ggplot2)

# Extract PCA coordinates
pca_data <- as.data.frame(pca_result$x)
pca_data$score_aggreg <- df_pca_encoded$score_aggreg

# ggplot(pca_data, aes(x = PC1, y = PC2, color = score_aggreg)) +
#   geom_point(size = 3) +
#   scale_color_gradient(low = "blue", high = "red") +
#   theme_minimal() +
#   labs(title = "PCA of Function Combinations Colored by score_aggreg")

  ggplot(pca_data, aes(x = PC1, y = score_aggreg)) +
  geom_point(size = 3, color = "steelblue") +
  theme_minimal() +
  labs(title = "PC1 vs score_aggreg")
# Get the proportion of variance explained
explained_var <- summary(pca_result)$importance["Proportion of Variance", ]

meaningful_pcs <- names(explained_var[explained_var > 0])

n_pcs_to_show <- min(length(meaningful_pcs), 3)
meaningful_pcs <- meaningful_pcs[1:n_pcs_to_show]


for (pc in meaningful_pcs) {
  cat("\nTop contributing variables to", pc, ":\n")
  loadings <- sort(abs(pca_result$rotation[, pc]), decreasing = TRUE)
  print(head(loadings, 10))
}
#> 
#> Top contributing variables to PC1 :
#>        preprocessing_RNA_nopp    feature_selection_RNA_nofs 
#>                     0.3532744                     0.3532744 
#>      preprocessing_scRNA_nopp  feature_selection_scRNA_nofs 
#>                     0.3532744                     0.3532744 
#>        deconvolution_rna_node feature_selection_mixRNA_nofs 
#>                     0.3532744                     0.3532744 
#>      late_integration_OnlyMet     preprocessing_mixRNA_nopp 
#>                     0.3532744                     0.3532744 
#>  feature_selection_scRNA_fsID    feature_selection_RNA_fsID 
#>                     0.0152800                     0.0152800 
#> 
#> Top contributing variables to PC2 :
#>      preprocessing_mixMET_nopp  feature_selection_mixMET_nofs 
#>                     0.40633984                     0.40633984 
#>         preprocessing_MET_nopp     feature_selection_MET_nofs 
#>                     0.40633984                     0.40633984 
#>         deconvolution_met_node       late_integration_OnlyRna 
#>                     0.40633984                     0.40633984 
#>        late_integration_limean    late_integration_limeanRMSE 
#>                     0.03393037                     0.03393037 
#>    preprocessing_MET_normalize preprocessing_mixMET_normalize 
#>                     0.02851307                     0.02851307 
#> 
#> Top contributing variables to PC3 :
#>        feature_selection_RNA_fsID      feature_selection_scRNA_fsID 
#>                         0.4147463                         0.4147463 
#>     feature_selection_mixRNA_fsID     preprocessing_scRNA_sccluster 
#>                         0.4147463                         0.3566051 
#> feature_selection_mixRNA_scSPLSDA    feature_selection_RNA_scSPLSDA 
#>                         0.2971756                         0.2971756 
#>  feature_selection_scRNA_scSPLSDA       preprocessing_scRNA_LogNorm 
#>                         0.2564685                         0.1480072 
#>           preprocessing_scRNA_CPM          preprocessing_scRNA_ppID 
#>                         0.1480072                         0.1480072

if (length(meaningful_pcs) == 0) {
  cat("No meaningful principal components found (explained variance is zero).")
}

5 Visualisations of the top methods

5.1 top 5 best methods